Data¶

In [476]:
import pandas as pd
import numpy as np
In [477]:
data = pd.read_csv(r"C:\Users\mekal\OneDrive\Documents\supply_chain_dataset.csv")
In [478]:
data
Out[478]:
Product_ID Warehouse Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Supplier_ID Order_Quantity Shipment_Quantity Product_Category Product_Price Customer_ID Order_Priority
0 8270 A 1/1/2023 1/2/2023 8 839 481 0 0 397 98 40 Clothing 387.89 2014 High
1 1860 C 1/1/2023 1/2/2023 8 351 332 0 1 483 52 35 Furniture 891.03 6805 High
2 6390 C 1/1/2023 1/2/2023 5 398 156 0 0 160 34 43 Food 933.21 3833 Medium
3 6191 A 1/1/2023 1/2/2023 5 972 418 1 0 161 32 45 Electronics 628.57 3640 Medium
4 6734 D 1/1/2023 1/2/2023 3 529 428 0 0 325 44 41 Electronics 70.33 9297 Medium
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8995 1822 B 1/10/2024 1/11/2024 4 426 31 0 0 127 22 15 Electronics 83.02 7220 Low
8996 3939 B 1/10/2024 1/11/2024 8 172 432 0 0 192 11 45 Clothing 116.94 9232 Medium
8997 8096 D 1/10/2024 1/11/2024 9 355 414 0 0 278 36 2 Clothing 352.30 2483 Medium
8998 6765 C 1/10/2024 1/11/2024 4 842 107 0 0 316 53 58 Clothing 361.41 6888 Medium
8999 2031 B 1/10/2024 1/11/2024 6 269 78 0 0 126 76 38 Electronics 978.42 2735 Medium

9000 rows × 16 columns

Basic Checks¶

In [479]:
print("Head of first five rows")
display(data.head())
print("Tail of last five rows")
display(data.tail())
Head of first five rows
Product_ID Warehouse Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Supplier_ID Order_Quantity Shipment_Quantity Product_Category Product_Price Customer_ID Order_Priority
0 8270 A 1/1/2023 1/2/2023 8 839 481 0 0 397 98 40 Clothing 387.89 2014 High
1 1860 C 1/1/2023 1/2/2023 8 351 332 0 1 483 52 35 Furniture 891.03 6805 High
2 6390 C 1/1/2023 1/2/2023 5 398 156 0 0 160 34 43 Food 933.21 3833 Medium
3 6191 A 1/1/2023 1/2/2023 5 972 418 1 0 161 32 45 Electronics 628.57 3640 Medium
4 6734 D 1/1/2023 1/2/2023 3 529 428 0 0 325 44 41 Electronics 70.33 9297 Medium
Tail of last five rows
Product_ID Warehouse Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Supplier_ID Order_Quantity Shipment_Quantity Product_Category Product_Price Customer_ID Order_Priority
8995 1822 B 1/10/2024 1/11/2024 4 426 31 0 0 127 22 15 Electronics 83.02 7220 Low
8996 3939 B 1/10/2024 1/11/2024 8 172 432 0 0 192 11 45 Clothing 116.94 9232 Medium
8997 8096 D 1/10/2024 1/11/2024 9 355 414 0 0 278 36 2 Clothing 352.30 2483 Medium
8998 6765 C 1/10/2024 1/11/2024 4 842 107 0 0 316 53 58 Clothing 361.41 6888 Medium
8999 2031 B 1/10/2024 1/11/2024 6 269 78 0 0 126 76 38 Electronics 978.42 2735 Medium
In [480]:
data.size
Out[480]:
144000
In [481]:
data.shape
Out[481]:
(9000, 16)
In [482]:
data.columns
Out[482]:
Index(['Product_ID', 'Warehouse', 'Order_Date', 'Shipment_Date', 'Lead_Time',
       'Demand_Forecast', 'Inventory_Level', 'Stockout_Flag', 'Backorder_Flag',
       'Supplier_ID', 'Order_Quantity', 'Shipment_Quantity',
       'Product_Category', 'Product_Price', 'Customer_ID', 'Order_Priority'],
      dtype='object')
In [483]:
data.dtypes
Out[483]:
Product_ID             int64
Warehouse             object
Order_Date            object
Shipment_Date         object
Lead_Time              int64
Demand_Forecast        int64
Inventory_Level        int64
Stockout_Flag          int64
Backorder_Flag         int64
Supplier_ID            int64
Order_Quantity         int64
Shipment_Quantity      int64
Product_Category      object
Product_Price        float64
Customer_ID            int64
Order_Priority        object
dtype: object
In [484]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Product_ID         9000 non-null   int64  
 1   Warehouse          9000 non-null   object 
 2   Order_Date         9000 non-null   object 
 3   Shipment_Date      9000 non-null   object 
 4   Lead_Time          9000 non-null   int64  
 5   Demand_Forecast    9000 non-null   int64  
 6   Inventory_Level    9000 non-null   int64  
 7   Stockout_Flag      9000 non-null   int64  
 8   Backorder_Flag     9000 non-null   int64  
 9   Supplier_ID        9000 non-null   int64  
 10  Order_Quantity     9000 non-null   int64  
 11  Shipment_Quantity  9000 non-null   int64  
 12  Product_Category   9000 non-null   object 
 13  Product_Price      9000 non-null   float64
 14  Customer_ID        9000 non-null   int64  
 15  Order_Priority     9000 non-null   object 
dtypes: float64(1), int64(10), object(5)
memory usage: 1.1+ MB

Convert text cols to Lower Case¶

In [485]:
import warnings
warnings.filterwarnings("ignore")
In [486]:
for col in data.columns:
    if data[col].dtype=='object':
        data[col] = data[col].str.lower()
In [487]:
data.head()
Out[487]:
Product_ID Warehouse Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Supplier_ID Order_Quantity Shipment_Quantity Product_Category Product_Price Customer_ID Order_Priority
0 8270 a 1/1/2023 1/2/2023 8 839 481 0 0 397 98 40 clothing 387.89 2014 high
1 1860 c 1/1/2023 1/2/2023 8 351 332 0 1 483 52 35 furniture 891.03 6805 high
2 6390 c 1/1/2023 1/2/2023 5 398 156 0 0 160 34 43 food 933.21 3833 medium
3 6191 a 1/1/2023 1/2/2023 5 972 418 1 0 161 32 45 electronics 628.57 3640 medium
4 6734 d 1/1/2023 1/2/2023 3 529 428 0 0 325 44 41 electronics 70.33 9297 medium

Checking missing values¶

In [488]:
data.isnull().sum()
Out[488]:
Product_ID           0
Warehouse            0
Order_Date           0
Shipment_Date        0
Lead_Time            0
Demand_Forecast      0
Inventory_Level      0
Stockout_Flag        0
Backorder_Flag       0
Supplier_ID          0
Order_Quantity       0
Shipment_Quantity    0
Product_Category     0
Product_Price        0
Customer_ID          0
Order_Priority       0
dtype: int64

Data Validation¶

In [489]:
def colcheck(col):
    print('Column DataType:', data[col].dtype)
    print("Number of Unique Values in Column:", data[col].nunique())
    print()
    print("Unique Values in Column:")
    print(data[col].unique())
    print()
    print("Number of value counts:", data[col].value_counts())

Product_Id¶

In [490]:
colcheck('Product_ID')
Column DataType: int64
Number of Unique Values in Column: 5649

Unique Values in Column:
[8270 1860 6390 ... 5377 8058 2031]

Number of value counts: Product_ID
8833    6
4083    6
4072    6
5780    6
1364    5
       ..
8817    1
3093    1
8071    1
9367    1
2031    1
Name: count, Length: 5649, dtype: int64

Warehouse¶

In [491]:
colcheck('Warehouse')
Column DataType: object
Number of Unique Values in Column: 4

Unique Values in Column:
['a' 'c' 'd' 'b']

Number of value counts: Warehouse
d    2312
a    2238
b    2235
c    2215
Name: count, dtype: int64

Order_Date¶

In [492]:
colcheck('Order_Date')
Column DataType: object
Number of Unique Values in Column: 375

Unique Values in Column:
['1/1/2023' '1/2/2023' '1/3/2023' '1/4/2023' '1/5/2023' '1/6/2023'
 '1/7/2023' '1/8/2023' '1/9/2023' '1/10/2023' '1/11/2023' '1/12/2023'
 '1/13/2023' '1/14/2023' '1/15/2023' '1/16/2023' '1/17/2023' '1/18/2023'
 '1/19/2023' '1/20/2023' '1/21/2023' '1/22/2023' '1/23/2023' '1/24/2023'
 '1/25/2023' '1/26/2023' '1/27/2023' '1/28/2023' '1/29/2023' '1/30/2023'
 '1/31/2023' '2/1/2023' '2/2/2023' '2/3/2023' '2/4/2023' '2/5/2023'
 '2/6/2023' '2/7/2023' '2/8/2023' '2/9/2023' '2/10/2023' '2/11/2023'
 '2/12/2023' '2/13/2023' '2/14/2023' '2/15/2023' '2/16/2023' '2/17/2023'
 '2/18/2023' '2/19/2023' '2/20/2023' '2/21/2023' '2/22/2023' '2/23/2023'
 '2/24/2023' '2/25/2023' '2/26/2023' '2/27/2023' '2/28/2023' '3/1/2023'
 '3/2/2023' '3/3/2023' '3/4/2023' '3/5/2023' '3/6/2023' '3/7/2023'
 '3/8/2023' '3/9/2023' '3/10/2023' '3/11/2023' '3/12/2023' '3/13/2023'
 '3/14/2023' '3/15/2023' '3/16/2023' '3/17/2023' '3/18/2023' '3/19/2023'
 '3/20/2023' '3/21/2023' '3/22/2023' '3/23/2023' '3/24/2023' '3/25/2023'
 '3/26/2023' '3/27/2023' '3/28/2023' '3/29/2023' '3/30/2023' '3/31/2023'
 '4/1/2023' '4/2/2023' '4/3/2023' '4/4/2023' '4/5/2023' '4/6/2023'
 '4/7/2023' '4/8/2023' '4/9/2023' '4/10/2023' '4/11/2023' '4/12/2023'
 '4/13/2023' '4/14/2023' '4/15/2023' '4/16/2023' '4/17/2023' '4/18/2023'
 '4/19/2023' '4/20/2023' '4/21/2023' '4/22/2023' '4/23/2023' '4/24/2023'
 '4/25/2023' '4/26/2023' '4/27/2023' '4/28/2023' '4/29/2023' '4/30/2023'
 '5/1/2023' '5/2/2023' '5/3/2023' '5/4/2023' '5/5/2023' '5/6/2023'
 '5/7/2023' '5/8/2023' '5/9/2023' '5/10/2023' '5/11/2023' '5/12/2023'
 '5/13/2023' '5/14/2023' '5/15/2023' '5/16/2023' '5/17/2023' '5/18/2023'
 '5/19/2023' '5/20/2023' '5/21/2023' '5/22/2023' '5/23/2023' '5/24/2023'
 '5/25/2023' '5/26/2023' '5/27/2023' '5/28/2023' '5/29/2023' '5/30/2023'
 '5/31/2023' '6/1/2023' '6/2/2023' '6/3/2023' '6/4/2023' '6/5/2023'
 '6/6/2023' '6/7/2023' '6/8/2023' '6/9/2023' '6/10/2023' '6/11/2023'
 '6/12/2023' '6/13/2023' '6/14/2023' '6/15/2023' '6/16/2023' '6/17/2023'
 '6/18/2023' '6/19/2023' '6/20/2023' '6/21/2023' '6/22/2023' '6/23/2023'
 '6/24/2023' '6/25/2023' '6/26/2023' '6/27/2023' '6/28/2023' '6/29/2023'
 '6/30/2023' '7/1/2023' '7/2/2023' '7/3/2023' '7/4/2023' '7/5/2023'
 '7/6/2023' '7/7/2023' '7/8/2023' '7/9/2023' '7/10/2023' '7/11/2023'
 '7/12/2023' '7/13/2023' '7/14/2023' '7/15/2023' '7/16/2023' '7/17/2023'
 '7/18/2023' '7/19/2023' '7/20/2023' '7/21/2023' '7/22/2023' '7/23/2023'
 '7/24/2023' '7/25/2023' '7/26/2023' '7/27/2023' '7/28/2023' '7/29/2023'
 '7/30/2023' '7/31/2023' '8/1/2023' '8/2/2023' '8/3/2023' '8/4/2023'
 '8/5/2023' '8/6/2023' '8/7/2023' '8/8/2023' '8/9/2023' '8/10/2023'
 '8/11/2023' '8/12/2023' '8/13/2023' '8/14/2023' '8/15/2023' '8/16/2023'
 '8/17/2023' '8/18/2023' '8/19/2023' '8/20/2023' '8/21/2023' '8/22/2023'
 '8/23/2023' '8/24/2023' '8/25/2023' '8/26/2023' '8/27/2023' '8/28/2023'
 '8/29/2023' '8/30/2023' '8/31/2023' '9/1/2023' '9/2/2023' '9/3/2023'
 '9/4/2023' '9/5/2023' '9/6/2023' '9/7/2023' '9/8/2023' '9/9/2023'
 '9/10/2023' '9/11/2023' '9/12/2023' '9/13/2023' '9/14/2023' '9/15/2023'
 '9/16/2023' '9/17/2023' '9/18/2023' '9/19/2023' '9/20/2023' '9/21/2023'
 '9/22/2023' '9/23/2023' '9/24/2023' '9/25/2023' '9/26/2023' '9/27/2023'
 '9/28/2023' '9/29/2023' '9/30/2023' '10/1/2023' '10/2/2023' '10/3/2023'
 '10/4/2023' '10/5/2023' '10/6/2023' '10/7/2023' '10/8/2023' '10/9/2023'
 '10/10/2023' '10/11/2023' '10/12/2023' '10/13/2023' '10/14/2023'
 '10/15/2023' '10/16/2023' '10/17/2023' '10/18/2023' '10/19/2023'
 '10/20/2023' '10/21/2023' '10/22/2023' '10/23/2023' '10/24/2023'
 '10/25/2023' '10/26/2023' '10/27/2023' '10/28/2023' '10/29/2023'
 '10/30/2023' '10/31/2023' '11/1/2023' '11/2/2023' '11/3/2023' '11/4/2023'
 '11/5/2023' '11/6/2023' '11/7/2023' '11/8/2023' '11/9/2023' '11/10/2023'
 '11/11/2023' '11/12/2023' '11/13/2023' '11/14/2023' '11/15/2023'
 '11/16/2023' '11/17/2023' '11/18/2023' '11/19/2023' '11/20/2023'
 '11/21/2023' '11/22/2023' '11/23/2023' '11/24/2023' '11/25/2023'
 '11/26/2023' '11/27/2023' '11/28/2023' '11/29/2023' '11/30/2023'
 '12/1/2023' '12/2/2023' '12/3/2023' '12/4/2023' '12/5/2023' '12/6/2023'
 '12/7/2023' '12/8/2023' '12/9/2023' '12/10/2023' '12/11/2023'
 '12/12/2023' '12/13/2023' '12/14/2023' '12/15/2023' '12/16/2023'
 '12/17/2023' '12/18/2023' '12/19/2023' '12/20/2023' '12/21/2023'
 '12/22/2023' '12/23/2023' '12/24/2023' '12/25/2023' '12/26/2023'
 '12/27/2023' '12/28/2023' '12/29/2023' '12/30/2023' '12/31/2023'
 '1/1/2024' '1/2/2024' '1/3/2024' '1/4/2024' '1/5/2024' '1/6/2024'
 '1/7/2024' '1/8/2024' '1/9/2024' '1/10/2024']

Number of value counts: Order_Date
1/1/2023     24
9/5/2023     24
9/14/2023    24
9/13/2023    24
9/12/2023    24
             ..
5/4/2023     24
5/3/2023     24
5/2/2023     24
5/1/2023     24
1/10/2024    24
Name: count, Length: 375, dtype: int64
In [493]:
data['Order_Date']=pd.to_datetime(data['Order_Date'])
In [494]:
data['Order_Date']
Out[494]:
0      2023-01-01
1      2023-01-01
2      2023-01-01
3      2023-01-01
4      2023-01-01
          ...    
8995   2024-01-10
8996   2024-01-10
8997   2024-01-10
8998   2024-01-10
8999   2024-01-10
Name: Order_Date, Length: 9000, dtype: datetime64[ns]

Shipment_Date¶

In [495]:
colcheck('Shipment_Date')
Column DataType: object
Number of Unique Values in Column: 375

Unique Values in Column:
['1/2/2023' '1/3/2023' '1/4/2023' '1/5/2023' '1/6/2023' '1/7/2023'
 '1/8/2023' '1/9/2023' '1/10/2023' '1/11/2023' '1/12/2023' '1/13/2023'
 '1/14/2023' '1/15/2023' '1/16/2023' '1/17/2023' '1/18/2023' '1/19/2023'
 '1/20/2023' '1/21/2023' '1/22/2023' '1/23/2023' '1/24/2023' '1/25/2023'
 '1/26/2023' '1/27/2023' '1/28/2023' '1/29/2023' '1/30/2023' '1/31/2023'
 '2/1/2023' '2/2/2023' '2/3/2023' '2/4/2023' '2/5/2023' '2/6/2023'
 '2/7/2023' '2/8/2023' '2/9/2023' '2/10/2023' '2/11/2023' '2/12/2023'
 '2/13/2023' '2/14/2023' '2/15/2023' '2/16/2023' '2/17/2023' '2/18/2023'
 '2/19/2023' '2/20/2023' '2/21/2023' '2/22/2023' '2/23/2023' '2/24/2023'
 '2/25/2023' '2/26/2023' '2/27/2023' '2/28/2023' '3/1/2023' '3/2/2023'
 '3/3/2023' '3/4/2023' '3/5/2023' '3/6/2023' '3/7/2023' '3/8/2023'
 '3/9/2023' '3/10/2023' '3/11/2023' '3/12/2023' '3/13/2023' '3/14/2023'
 '3/15/2023' '3/16/2023' '3/17/2023' '3/18/2023' '3/19/2023' '3/20/2023'
 '3/21/2023' '3/22/2023' '3/23/2023' '3/24/2023' '3/25/2023' '3/26/2023'
 '3/27/2023' '3/28/2023' '3/29/2023' '3/30/2023' '3/31/2023' '4/1/2023'
 '4/2/2023' '4/3/2023' '4/4/2023' '4/5/2023' '4/6/2023' '4/7/2023'
 '4/8/2023' '4/9/2023' '4/10/2023' '4/11/2023' '4/12/2023' '4/13/2023'
 '4/14/2023' '4/15/2023' '4/16/2023' '4/17/2023' '4/18/2023' '4/19/2023'
 '4/20/2023' '4/21/2023' '4/22/2023' '4/23/2023' '4/24/2023' '4/25/2023'
 '4/26/2023' '4/27/2023' '4/28/2023' '4/29/2023' '4/30/2023' '5/1/2023'
 '5/2/2023' '5/3/2023' '5/4/2023' '5/5/2023' '5/6/2023' '5/7/2023'
 '5/8/2023' '5/9/2023' '5/10/2023' '5/11/2023' '5/12/2023' '5/13/2023'
 '5/14/2023' '5/15/2023' '5/16/2023' '5/17/2023' '5/18/2023' '5/19/2023'
 '5/20/2023' '5/21/2023' '5/22/2023' '5/23/2023' '5/24/2023' '5/25/2023'
 '5/26/2023' '5/27/2023' '5/28/2023' '5/29/2023' '5/30/2023' '5/31/2023'
 '6/1/2023' '6/2/2023' '6/3/2023' '6/4/2023' '6/5/2023' '6/6/2023'
 '6/7/2023' '6/8/2023' '6/9/2023' '6/10/2023' '6/11/2023' '6/12/2023'
 '6/13/2023' '6/14/2023' '6/15/2023' '6/16/2023' '6/17/2023' '6/18/2023'
 '6/19/2023' '6/20/2023' '6/21/2023' '6/22/2023' '6/23/2023' '6/24/2023'
 '6/25/2023' '6/26/2023' '6/27/2023' '6/28/2023' '6/29/2023' '6/30/2023'
 '7/1/2023' '7/2/2023' '7/3/2023' '7/4/2023' '7/5/2023' '7/6/2023'
 '7/7/2023' '7/8/2023' '7/9/2023' '7/10/2023' '7/11/2023' '7/12/2023'
 '7/13/2023' '7/14/2023' '7/15/2023' '7/16/2023' '7/17/2023' '7/18/2023'
 '7/19/2023' '7/20/2023' '7/21/2023' '7/22/2023' '7/23/2023' '7/24/2023'
 '7/25/2023' '7/26/2023' '7/27/2023' '7/28/2023' '7/29/2023' '7/30/2023'
 '7/31/2023' '8/1/2023' '8/2/2023' '8/3/2023' '8/4/2023' '8/5/2023'
 '8/6/2023' '8/7/2023' '8/8/2023' '8/9/2023' '8/10/2023' '8/11/2023'
 '8/12/2023' '8/13/2023' '8/14/2023' '8/15/2023' '8/16/2023' '8/17/2023'
 '8/18/2023' '8/19/2023' '8/20/2023' '8/21/2023' '8/22/2023' '8/23/2023'
 '8/24/2023' '8/25/2023' '8/26/2023' '8/27/2023' '8/28/2023' '8/29/2023'
 '8/30/2023' '8/31/2023' '9/1/2023' '9/2/2023' '9/3/2023' '9/4/2023'
 '9/5/2023' '9/6/2023' '9/7/2023' '9/8/2023' '9/9/2023' '9/10/2023'
 '9/11/2023' '9/12/2023' '9/13/2023' '9/14/2023' '9/15/2023' '9/16/2023'
 '9/17/2023' '9/18/2023' '9/19/2023' '9/20/2023' '9/21/2023' '9/22/2023'
 '9/23/2023' '9/24/2023' '9/25/2023' '9/26/2023' '9/27/2023' '9/28/2023'
 '9/29/2023' '9/30/2023' '10/1/2023' '10/2/2023' '10/3/2023' '10/4/2023'
 '10/5/2023' '10/6/2023' '10/7/2023' '10/8/2023' '10/9/2023' '10/10/2023'
 '10/11/2023' '10/12/2023' '10/13/2023' '10/14/2023' '10/15/2023'
 '10/16/2023' '10/17/2023' '10/18/2023' '10/19/2023' '10/20/2023'
 '10/21/2023' '10/22/2023' '10/23/2023' '10/24/2023' '10/25/2023'
 '10/26/2023' '10/27/2023' '10/28/2023' '10/29/2023' '10/30/2023'
 '10/31/2023' '11/1/2023' '11/2/2023' '11/3/2023' '11/4/2023' '11/5/2023'
 '11/6/2023' '11/7/2023' '11/8/2023' '11/9/2023' '11/10/2023' '11/11/2023'
 '11/12/2023' '11/13/2023' '11/14/2023' '11/15/2023' '11/16/2023'
 '11/17/2023' '11/18/2023' '11/19/2023' '11/20/2023' '11/21/2023'
 '11/22/2023' '11/23/2023' '11/24/2023' '11/25/2023' '11/26/2023'
 '11/27/2023' '11/28/2023' '11/29/2023' '11/30/2023' '12/1/2023'
 '12/2/2023' '12/3/2023' '12/4/2023' '12/5/2023' '12/6/2023' '12/7/2023'
 '12/8/2023' '12/9/2023' '12/10/2023' '12/11/2023' '12/12/2023'
 '12/13/2023' '12/14/2023' '12/15/2023' '12/16/2023' '12/17/2023'
 '12/18/2023' '12/19/2023' '12/20/2023' '12/21/2023' '12/22/2023'
 '12/23/2023' '12/24/2023' '12/25/2023' '12/26/2023' '12/27/2023'
 '12/28/2023' '12/29/2023' '12/30/2023' '12/31/2023' '1/1/2024' '1/2/2024'
 '1/3/2024' '1/4/2024' '1/5/2024' '1/6/2024' '1/7/2024' '1/8/2024'
 '1/9/2024' '1/10/2024' '1/11/2024']

Number of value counts: Shipment_Date
1/2/2023     24
9/6/2023     24
9/15/2023    24
9/14/2023    24
9/13/2023    24
             ..
5/5/2023     24
5/4/2023     24
5/3/2023     24
5/2/2023     24
1/11/2024    24
Name: count, Length: 375, dtype: int64
In [496]:
data['Shipment_Date'] = pd.to_datetime(data['Shipment_Date'])
In [497]:
data['Shipment_Date']
Out[497]:
0      2023-01-02
1      2023-01-02
2      2023-01-02
3      2023-01-02
4      2023-01-02
          ...    
8995   2024-01-11
8996   2024-01-11
8997   2024-01-11
8998   2024-01-11
8999   2024-01-11
Name: Shipment_Date, Length: 9000, dtype: datetime64[ns]

Lead_Time¶

In [498]:
colcheck('Lead_Time')
Column DataType: int64
Number of Unique Values in Column: 9

Unique Values in Column:
[8 5 3 7 6 1 4 9 2]

Number of value counts: Lead_Time
2    1037
4    1018
6    1014
7    1012
1    1004
3    1001
8     992
9     967
5     955
Name: count, dtype: int64

Demand_Forecast¶

In [499]:
colcheck('Demand_Forecast')
Column DataType: int64
Number of Unique Values in Column: 900

Unique Values in Column:
[839 351 398 972 529 103 752 777 353 764 678 666 195 266 932 430 997 987
 953 161 273 352 560 723 314 338 514 838 841 844 774 225 944 810 296 733
 512 555 875 153 746 438 458 250 199 333 643 295 641 854 205 394 744 550
 633 255 505 995 825 837 896 782 653 671 474 688 554 301 831 551 286 707
 823 467 439 361 940 642 650 405 945 737 264 857 720 187 299 370 150 588
 297 122 322 320 762 365 172 980 176 534 164 456 117 775 898 665 674 868
 850 535 540 108 569 731 447 701 471 923 274 614 229 392 795 452 323 623
 806 858 608 765 689 751 599 189 755 171 532 788 362 607 716 519 173 754
 602 673 245 145 840 204 208 416 220 598 142 198 805 410 313 562 522 488
 796 211 958 371 924 201 309 892 121 859 258 990 442 499 553 798 242 949
 969 496 635 240 339 391 916 181 659 964 270 873 679 625 209 311 537 421
 999 813 183 475 152 966 595 355 685 319 228 318 576 882 557 789 728 428
 790 579 884 973 971 278 520 581 148 926 866 129 184 888 601 645 709 533
 687 786 952 348 951 233 464 879 524 869 611 484 487 914 548 663 402 656
 141 567 539 426 668 132 511 725 911 213 155 655 451 369 736 178 756 182
 236 638 818 824 965 648 605 771 528 979 780 411 179 113 546 865 669 632
 359 684 347 226 367 739 193 835 415 711 170 692 466 719 713 509 190 976
 627 874 864 188 706 346 460 477 305 444 929 897 166 657 328 912 543 298
 596 947 131 363 212 558 375 105 593 207 871 773 343 779 960 316 933 828
 819 691 991 154 616 547 134 218 870 954 977 613 693 961 521 785 238 804
 749 908 727 143 957 900 667 853 357 612 476 992 388 202 903 248 125 834
 740 753 445 401 397 234 473 930 556 454 231 592 963 681 895 959 395 909
 985 872 200 672 282 978 817 791 203 776 565 217 191 584 186 223 302 621
 826 507 157 590 419 118 216 235 938 624 479 254 699 741 883 227 500 628
 649 459 163 988 660 376 326 943 256 894 489 462 563 498 504 925 682 441
 852 962 515 396 847 905 842 793 393 549 544 159 491 910 766 784 634 587
 820 221 931 252 407 942 680 696 381 307 651 861 472 747 432 492 809 846
 891 717 494 372 495 851 306 856 704 904 561 366 308 934 247 939 768 552
 718 383 287 573 606 232 721 536 760 697 803 281 618 127 382 862 799 482
 418 937 710 292 501 743 571 878 745 989 797 708 502 399 450 446 722 508
 920 408 276 526 630 517 315 812 506 344 165 327 981 293 757 683 453 575
 115 177 151 603 843 160 481 732 194 787 622 144 901 304 830 530 140 889
 294 434 175 886 974 329 261 470 662 100 734 899 497 448 102 158 431 268
 948 968 620 604 403 214 763 527 885 640 814 101 423 197 778 436 698 821
 404 589 935 545 377 750 331 525 902 133 863 224 646 412 586 111 758 335
 670 342 568 324 829 686 336 822 827 317 531 724 577 468 735 726 387 110
 350 427 437 652 890 594 770 332 816 288 107 192 260 585 970 167 836 263
 378 767 781 271 457 112 772 570 259 738 360 354 379 986 759 617 636 463
 429 443 967 919 742 993 300 341 337 106 867 321 280 262 123 135 386 998
 285 887 210 928 130 615 729 609 275 222 330 654 801 915 349 257 676 936
 128 705 848 907 631 461 291 440 239 406 433 493 120 647 303 345 833 267
 893 518 610 156 730 237 982 485 950 661 802 422 138 510 832 523 374 906
 283 700 860 715 373 277 358 174 253 639 243 714 413 658 168 114 955 384
 390 241 566 677 946 483 975 162 503 480 417 807 368 139 849 513 380 855
 800 230 455 146 104 921 761 811 289 185 119 409 572 124 808 792 284 712
 356 996 312 917 994 815 703 538 695 591 927 578 675 389 136 340 310 249
 983 465 748 769 881 196 600 564 541 626 637 169 542 580 449 597 877 215
 941 147 984 149 325 582 385 126 486 425 574 279 251 876 583 794 913 880
 206 400 629 272 290 783 478 269 420 490 845 334 364 644 219 702 469 246
 559 244 922 690 109 116 956 516 664 137 414 424 918 265 180 694 435 619]

Number of value counts: Demand_Forecast
442    25
608    23
324    21
498    20
456    20
       ..
883     4
808     3
389     3
547     3
918     3
Name: count, Length: 900, dtype: int64

Inventory_Level¶

In [500]:
colcheck('Inventory_Level')
Column DataType: int64
Number of Unique Values in Column: 500

Unique Values in Column:
[481 332 156 418 428 457 221 363 345  98 219 412 243 453 258 476 440 193
 374 149 493 366 497 154 498  21 115 112 432 210  23 282 291 362 400  55
 265  53 336 107 201 421 382 473 217 242 353 387 177 474  60 404  35 357
  45  94 326  43 319 225 145 471 268  12  24 260 127  38 184 173 416 393
 185 280 488  73 294  15 187 411 459 171  31  62 348 300 308 188 305 410
  25 386 192 162  52 195 277 389 307 456 330 213 494 328  28 170 121 417
 158  85 301   3   1 132  78 146 165 314 462 237 327 275 203 451 485 106
  54 245 422 376 163 441  14   0 350 136 150 179 409 231 491 486  17 224
 104 205  64 371 206 228  68 141 196 415 306 227 270 436 198 324 477 285
 333 407 215  34 395 211 381 452 247 419  84  47 230  66 437  88 218  51
 445 448 130  87 135 495 373 335 223 116 383 435  11 293 340 168  75 402
  70  91 463 208 341 338 175  18 354  46 344  59 254 403 297 238 331 398
 323 334  74 182  86 414 244   6 499 241 379 420 125 355 189  20 464  32
 181 299 413 406 466 271 438 272 458 172   8 496 292  71 450 479 122  90
 251 408 478 255 439 117 434 460 160 367 313 167   2 129  22  97 157   9
 320 183 216 209 105 118 137 119 207 317 337 262 329 312  13  76 454 250
 295 139 475 128 304  95  27 229 455 174  29 164 472 101 372   7 352 100
 110 342 461 200  63 126  48 385 392 176 202 284 288  82 111 153 369 394
 212 359 303 199 166 279 465 220 484  26 388 442 197 490 470 151  37 467
 431  83 138 315 134 399 236 310 375 339 360 235 246 234 469 147  92 226
 318  30 113 390 283  16 214 120   5 267 102 103 290 347 204 423  81 286
  36 114 131 365 281 144 123 316 396  10 447 377 278 370 482 169 384 190
 433  33  50  77 325  80  56 186 302 430  79 446  42 124  39 289 259 191
 368 133 351 405 449 311 248 232 429  67 480 152 239 276 483 108 180 140
 273  61 361 264 142  93  19  44 346 159 427  99 298 492  72   4  69 296
 378 349 444 358 364 109 222 257 397 391 194 426 148 489 356 322 263  65
  58 380  89 266 443 487 261 287 143 269 425 249 401 240 424 343 233  41
 468 321  49  57 253 309  40 256 178 252 161 274 155  96]

Number of value counts: Inventory_Level
437    32
367    31
446    31
229    31
17     31
       ..
343     8
257     8
290     8
96      8
191     5
Name: count, Length: 500, dtype: int64

Stockout_Flag¶

In [501]:
colcheck('Stockout_Flag')
Column DataType: int64
Number of Unique Values in Column: 3

Unique Values in Column:
[0 1 2]

Number of value counts: Stockout_Flag
0    7204
1    1779
2      17
Name: count, dtype: int64

Backorder_Flag¶

In [502]:
colcheck('Backorder_Flag')
Column DataType: int64
Number of Unique Values in Column: 3

Unique Values in Column:
[0 1 2]

Number of value counts: Backorder_Flag
0    7588
1    1386
2      26
Name: count, dtype: int64

Supplier_ID¶

In [503]:
colcheck('Supplier_ID')
Column DataType: int64
Number of Unique Values in Column: 400

Unique Values in Column:
[397 483 160 161 325 354 352 376 258 338 142 462 413 251 400 179 236 312
 485 264 482 296 401 361 147 226 467 235 131 200 176 294 300 167 127 499
 156 302 154 182 205 350 441 431 459 232 456 291 197 367 206 165 151 298
 387 331 262 340 489 121 316 274 496 378 271 108 461 158 216 344 385 439
 399 276 115 220 313 329 429 183 280 332 422 141 491 172 447 470 452 287
 389 487 339 472 417 195 175 101 277 211 303 337 125 440 207 455 373 379
 129 116 180 463 495 126 282 103 171 391 359 110 358 301 212 335 105 188
 213 464 434 420 410 267 437 475 450 256 246 477 222 386 281 198 289 497
 405 414 297 346 364 382 157 436 423 285 435 345 348 102 150 204 270 254
 245 334 498 451 356 124 394 192 149 318 255 330 306 371 314 283 390 166
 310 173 404 117 218 342 202 307 248 292 412 321 146 403 266 186 219 347
 243 449 336 416 357 421 395 230 169 284 148 411 133 396 109 474 448 119
 252 479 384 215 349 144 393 170 442 104 196 100 185 164 123 250 402 490
 106 203 181 465 480 319 134 261 458 460 177 457 453 135 494 454 229 187
 419 369 225 426 107 241 311 383 231 493 201 174 199 368 353 398 428 469
 327 476 244 138 293 432 377 351 341 278 111 392 184 305 112 140 139 409
 233 415 486 295 478 430 365 273 317 473 152 374 168 239 407 189 114 290
 304 259 444 162 253 159 355 137 468 425 279 388 191 210 209 224 370 122
 299 443 263 268 269 113 118 208 362 471 309 424 328 194 217 136 286 408
 406 130 308 214 227 242 333 326 132 427 257 163 145 445 481 153 275 323
 380 234 223 240 381 190 372 466 128 247 249 238 120 488 265 492 288 375
 178 418 446 324 366 237 484 322 193 360 315 260 343 363 143 320 438 221
 155 272 433 228]

Number of value counts: Supplier_ID
414    36
388    34
342    34
357    33
483    32
       ..
224    12
132    12
191    11
216    11
221    10
Name: count, Length: 400, dtype: int64

Order_Quantity¶

In [504]:
colcheck('Order_Quantity')
Column DataType: int64
Number of Unique Values in Column: 99

Unique Values in Column:
[98 52 34 32 44  4 48 16 60 47 75 15 94 56 91  1 25 64 92 43 14 68 80 77
 17 10  8  2 65 55 69 99 86 73 36  9 74 22 41 67 28 29 11 26 31 20 84 97
 95  7 54 63 82 13 33 30 70 21  5 93 35 12 37 87 42 88 76 50 18 81 85 90
 19 89 59 96 53 58 57 61 83 24 38 46  6 45 62 79 72 66 23 39 71 78 49 40
  3 27 51]

Number of value counts: Order_Quantity
98    116
2     112
4     108
80    107
23    107
     ... 
68     74
50     74
47     74
49     73
17     72
Name: count, Length: 99, dtype: int64

Shipment_Quantity¶

In [505]:
colcheck('Shipment_Quantity')
Column DataType: int64
Number of Unique Values in Column: 99

Unique Values in Column:
[40 35 43 45 41 64 15 44 42 59 85  4 71 29  6 39 81 66 93 27  7 11 90 32
 87 51  1 25 95 92 16 22 18 50 72 10 49 70 58 99 28 89 73  5 57 80 55 23
 26  8 19 68 84 86 76 65 60 33 97 46 37 69 94 67 63  3 52 14 62 56 82  9
  2 96 31 47 91 53 75 36 38 74 78 79 48 20 88 30 54 83 61 98 34 13 77 12
 17 21 24]

Number of value counts: Shipment_Quantity
93    115
69    112
4     112
90    110
48    110
     ... 
21     75
43     74
24     71
68     68
99     64
Name: count, Length: 99, dtype: int64

Product_Category¶

In [506]:
colcheck('Product_Category')
Column DataType: object
Number of Unique Values in Column: 4

Unique Values in Column:
['clothing' 'furniture' 'food' 'electronics']

Number of value counts: Product_Category
electronics    2278
clothing       2275
furniture      2255
food           2192
Name: count, dtype: int64

Product_Price¶

In [507]:
colcheck('Product_Price')
Column DataType: float64
Number of Unique Values in Column: 8594

Unique Values in Column:
[387.89 891.03 933.21 ... 116.94 352.3  361.41]

Number of value counts: Product_Price
455.32    3
763.14    3
881.29    3
945.83    3
154.44    3
         ..
583.86    1
958.31    1
636.28    1
350.80    1
361.41    1
Name: count, Length: 8594, dtype: int64

Customer_ID¶

In [508]:
colcheck('Customer_ID')
Column DataType: int64
Number of Unique Values in Column: 5703

Unique Values in Column:
[2014 6805 3833 ... 3037 4953 9232]

Number of value counts: Customer_ID
5236    6
8817    6
8956    6
1359    6
8760    5
       ..
2154    1
5328    1
3593    1
4437    1
9232    1
Name: count, Length: 5703, dtype: int64

Order_Priority¶

In [509]:
colcheck('Order_Priority')
Column DataType: object
Number of Unique Values in Column: 3

Unique Values in Column:
['high' 'medium' 'low']

Number of value counts: Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64
In [510]:
data.head()
Out[510]:
Product_ID Warehouse Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Supplier_ID Order_Quantity Shipment_Quantity Product_Category Product_Price Customer_ID Order_Priority
0 8270 a 2023-01-01 2023-01-02 8 839 481 0 0 397 98 40 clothing 387.89 2014 high
1 1860 c 2023-01-01 2023-01-02 8 351 332 0 1 483 52 35 furniture 891.03 6805 high
2 6390 c 2023-01-01 2023-01-02 5 398 156 0 0 160 34 43 food 933.21 3833 medium
3 6191 a 2023-01-01 2023-01-02 5 972 418 1 0 161 32 45 electronics 628.57 3640 medium
4 6734 d 2023-01-01 2023-01-02 3 529 428 0 0 325 44 41 electronics 70.33 9297 medium
In [511]:
vdata = data[['Order_Date','Shipment_Date','Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity','Product_Price','Order_Priority']]
In [512]:
vdata
Out[512]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price Order_Priority
0 2023-01-01 2023-01-02 8 839 481 0 0 98 40 387.89 high
1 2023-01-01 2023-01-02 8 351 332 0 1 52 35 891.03 high
2 2023-01-01 2023-01-02 5 398 156 0 0 34 43 933.21 medium
3 2023-01-01 2023-01-02 5 972 418 1 0 32 45 628.57 medium
4 2023-01-01 2023-01-02 3 529 428 0 0 44 41 70.33 medium
... ... ... ... ... ... ... ... ... ... ... ...
8995 2024-01-10 2024-01-11 4 426 31 0 0 22 15 83.02 low
8996 2024-01-10 2024-01-11 8 172 432 0 0 11 45 116.94 medium
8997 2024-01-10 2024-01-11 9 355 414 0 0 36 2 352.30 medium
8998 2024-01-10 2024-01-11 4 842 107 0 0 53 58 361.41 medium
8999 2024-01-10 2024-01-11 6 269 78 0 0 76 38 978.42 medium

9000 rows × 11 columns

EDA (Stats + Visual)¶

EDA¶

  • EDA is the process of data understanding on Validated data

  • In statistics, Exploratory Data Analysis (EDA) is an approach of analyzing data sets to summarize their main characteristics, often using statistical graphics and other data visualization methods.

  • EDA will follow two concepts,

    • Stats
    • Descriptive
    • Visual Analysis
In [513]:
vdata.head(2)
Out[513]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price Order_Priority
0 2023-01-01 2023-01-02 8 839 481 0 0 98 40 387.89 high
1 2023-01-01 2023-01-02 8 351 332 0 1 52 35 891.03 high
In [514]:
vdata.dtypes
Out[514]:
Order_Date           datetime64[ns]
Shipment_Date        datetime64[ns]
Lead_Time                     int64
Demand_Forecast               int64
Inventory_Level               int64
Stockout_Flag                 int64
Backorder_Flag                int64
Order_Quantity                int64
Shipment_Quantity             int64
Product_Price               float64
Order_Priority               object
dtype: object
In [515]:
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm


from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import plotly.express as px

from simple_colors import *

import warnings
warnings.filterwarnings("ignore")

Identifying the variables types according to stats¶

Numerical Categorical
Lead_Time Order_Priority
Demand_Forecast X
Inventory_Level X
Stockout_Flag X
Backorder_Flag X
Order_Quantity X
Shipment_Quantity X
Product_Price X

Analysis¶

EDA can be further divided into three types of anal ysis Uni-Variate|Bi-Variate|Multi-Variate ---|---|---- Data study of single column|Data study between two columns|Data Study b/w three or more columns

Uni - Variate Analysis¶

  • Descriptive Stats Univariate measures,
    • Numeric
      • Discrete
        • round(Mean), round(Median), Mode, Five Number Summary, Std, Skewness, Kurtosis
      • Continuous
          • Mean, Median, Five Number Summary, Std, Skewness, Kurtosis
    • Categorical & Boolean
      • nunique
      • unique
      • FDT
      • Mode
In [516]:
def univariate(vdata):
    for col in vdata.columns:
        
        if vdata[col].dtype == 'object':
            print(green("Column Name:", ['bold']), col)
            print(blue("=============================================================", ['bold']))
            print()
            print(blue("*", ['bold']))
            print(black("          Descriptive Stats              ", ['bold']))
            print(blue("*", ['bold']))
            print()

            # pandas inbuilt functions for unique, nunique, fdt
            print(magenta("Number of Unique Classes:", ['bold']), vdata[col].nunique())
            print()
            print(magenta("Unique Classes:", ['bold']))
            print(vdata[col].unique())
            print()
            print(magenta("Value Counts of each class (FDT):", ['bold']))
            print(vdata[col].value_counts())  # FDT
            print()
            print(magenta("Each Class percentage:", ['bold']))
            print(round((vdata[col].value_counts() / len(vdata)) * 100,2))  # FDT Percentage
            print()
            print(magenta("Mode value:", ['bold']), vdata[col].mode()[0])
            print()
            print(cyan("*", ['bold']))
            print(black("      Visual Analysis - Pie Chart      ", ['bold']))
            print(cyan("*", ['bold']))
            print()
            print(black("Top Catgeories:", ['bold']))
            
            # Considering only top 10 categories for pie chart
            index = vdata[col].value_counts().sort_values(ascending=False)[0:10].index
            vals = vdata[col].value_counts().sort_values(ascending=False)[0:10].values
            fig = px.pie(names=index, values=vals, width=700, height=400)
            fig.update_traces(textposition='inside', textinfo='percent+label')
            fig.update_layout(showlegend=False)
            fig.show()
            print()
            

        elif vdata[col].dtype == 'int32' or vdata[col].dtype == 'int64':
            print(green("Column Name:", ['bold']), col)
            print(blue("============================================================"))
            print()
            print(blue("*", ['bold']))
            print(black("          Descriptive Stats              ", ['bold']))
            print(blue("*", ['bold']))
            print()

            # pandas inbuilt functions for the mct, ms stats

            # Measure of Central Tendancy
            print(magenta("Mean:", ['bold']), round(vdata[col].mean()))
            print(magenta("Median:", ['bold']), round(vdata[col].median()))
#             print("Mode of the {} Col:".format(col),data[col].mode()[0])  # optional
            print()
            # Measure of Spread
            print(magenta("Min & Max:", ['bold']),(vdata[col].min(), vdata[col].max()))
            print(magenta("Standard Deviation:", ['bold']),round(vdata[col].std()))
            print()
            # Short Summary
            print(magenta("Summary & Quantiles:", ['bold']))
            print(round(vdata[col].describe()))
            print()
            # Measure of Symmetry
            print(magenta("Skewness:", ['bold']), round(vdata[col].skew()))
            print(magenta("Kurtosis:", ['bold']), round(vdata[col].kurt()))
            print()
            print(cyan("",['bold']))
            print(black(" Visual Analysis - Distplot (Histogram + Desnsity plot) ",['bold']))
            print(cyan("",['bold']))
            print()
            plt.figure(figsize=(4, 3))
            sns.distplot(vdata[col])
            plt.show()
            
        elif vdata[col].dtype == 'float32' or vdata[col].dtype == 'float64':
            
            print(green("Column Name:", ['bold']), col)
            print(blue("============================================================"))
            print()
            print(blue("*", ['bold']))
            print(black("          Descriptive Stats              ", ['bold']))
            print(blue("*", ['bold']))
            print()

            # pandas inbuilt functions for the mct, ms stats

            # Measure of Central Tendancy
            print(magenta("Mean:", ['bold']), round(vdata[col].mean(), 2))
            print(magenta("Median:", ['bold']), vdata[col].median())
            print(magenta("Mode of the {} Col:".format(col),vdata[col].mode()[0]))  # optional
            print()
            # Measure of Spread
            print(magenta("Min & Max:", ['bold']),(vdata[col].min(), vdata[col].max()))
            print(magenta("Standard Deviation:", ['bold']),round(vdata[col].std(), 2))
            print()
            # Short Summary
            print(magenta("Summary & Quantiles:", ['bold']))
            print(round(vdata[col].describe(), 2))
            print()
            # Measure of Symmetry
            print(magenta("Skewness:", ['bold']), round(vdata[col].skew(), 2))
            print(magenta("Kurtosis:", ['bold']), round(vdata[col].kurt(), 2))
            print()
            print(cyan("",['bold']))
            print(black(" Visual Analysis - Distplot (Histogram + Desnsity plot) ",['bold']))
            print(cyan("",['bold']))
            print()
            plt.figure(figsize=(4, 3))
            sns.distplot(vdata[col])
            plt.show()
        elif vdata[col].dtype == 'datetime64[ns]':
            print(green("Column:", ['bold']), black(col, ['bold']))
            print(blue("=============================================================", ['bold']))
            print()
            print(cyan("           Descriptive Stats        ", ['bold']))
            print(cyan("<----------------------------------->",['bold']))
            print()
            print(f"Start Date: {vdata[col].min()}")
            print(f"End Date: {vdata[col].max()}")
            print()
In [517]:
univariate(vdata)
Column: Order_Date
=============================================================

           Descriptive Stats        
<----------------------------------->

Start Date: 2023-01-01 00:00:00
End Date: 2024-01-10 00:00:00

Column: Shipment_Date
=============================================================

           Descriptive Stats        
<----------------------------------->

Start Date: 2023-01-02 00:00:00
End Date: 2024-01-11 00:00:00

Column Name: Lead_Time
============================================================

*
          Descriptive Stats              
*

Mean: 5
Median: 5

Min & Max: (1, 9)
Standard Deviation: 3

Summary & Quantiles:
count    9000.0
mean        5.0
std         3.0
min         1.0
25%         3.0
50%         5.0
75%         7.0
max         9.0
Name: Lead_Time, dtype: float64

Skewness: 0
Kurtosis: -1


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Demand_Forecast
============================================================

*
          Descriptive Stats              
*

Mean: 551
Median: 555

Min & Max: (100, 999)
Standard Deviation: 259

Summary & Quantiles:
count    9000.0
mean      551.0
std       259.0
min       100.0
25%       327.0
50%       555.0
75%       773.0
max       999.0
Name: Demand_Forecast, dtype: float64

Skewness: 0
Kurtosis: -1


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Inventory_Level
============================================================

*
          Descriptive Stats              
*

Mean: 248
Median: 246

Min & Max: (0, 499)
Standard Deviation: 145

Summary & Quantiles:
count    9000.0
mean      248.0
std       145.0
min         0.0
25%       123.0
50%       246.0
75%       375.0
max       499.0
Name: Inventory_Level, dtype: float64

Skewness: 0
Kurtosis: -1


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Stockout_Flag
============================================================

*
          Descriptive Stats              
*

Mean: 0
Median: 0

Min & Max: (0, 2)
Standard Deviation: 0

Summary & Quantiles:
count    9000.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         2.0
Name: Stockout_Flag, dtype: float64

Skewness: 2
Kurtosis: 1


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Backorder_Flag
============================================================

*
          Descriptive Stats              
*

Mean: 0
Median: 0

Min & Max: (0, 2)
Standard Deviation: 0

Summary & Quantiles:
count    9000.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         2.0
Name: Backorder_Flag, dtype: float64

Skewness: 2
Kurtosis: 3


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Order_Quantity
============================================================

*
          Descriptive Stats              
*

Mean: 50
Median: 51

Min & Max: (1, 99)
Standard Deviation: 29

Summary & Quantiles:
count    9000.0
mean       50.0
std        29.0
min         1.0
25%        25.0
50%        51.0
75%        76.0
max        99.0
Name: Order_Quantity, dtype: float64

Skewness: 0
Kurtosis: -1


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Shipment_Quantity
============================================================

*
          Descriptive Stats              
*

Mean: 49
Median: 49

Min & Max: (1, 99)
Standard Deviation: 29

Summary & Quantiles:
count    9000.0
mean       49.0
std        29.0
min         1.0
25%        25.0
50%        49.0
75%        74.0
max        99.0
Name: Shipment_Quantity, dtype: float64

Skewness: 0
Kurtosis: -1


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Product_Price
============================================================

*
          Descriptive Stats              
*

Mean: 503.12
Median: 497.55
Mode of the Product_Price Col:

Min & Max: (10.14, 999.89)
Standard Deviation: 287.87

Summary & Quantiles:
count    9000.00
mean      503.12
std       287.87
min        10.14
25%       254.50
50%       497.55
75%       755.32
max       999.89
Name: Product_Price, dtype: float64

Skewness: 0.01
Kurtosis: -1.21


 Visual Analysis - Distplot (Histogram + Desnsity plot) 


No description has been provided for this image
Column Name: Order_Priority
=============================================================

*
          Descriptive Stats              
*

Number of Unique Classes: 3

Unique Classes:
['high' 'medium' 'low']

Value Counts of each class (FDT):
Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64

Each Class percentage:
Order_Priority
medium    33.41
low       33.36
high      33.23
Name: count, dtype: float64

Mode value: medium

*
      Visual Analysis - Pie Chart      
*

Top Catgeories:

Bi - Variate Analysis¶

Descriptive Stats Measures used to study data between two columns¶

Bi-Variate Combo Stats Measures
Numeric-Numeric Correlation (-1 to +1)
Numeric-Categorical Aggregation Functions (count, min, max, avg, sum)
Categorical-Categorical FDT

Notes:

  • Correlation Coeffiecient Relation Categories

           0.75 to 1 - Strong Correlation
           0.50 to 0.75 - Moderate Correlation
           <0.50 - Weak Correlation
    
In [518]:
vdata.head(2)
Out[518]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price Order_Priority
0 2023-01-01 2023-01-02 8 839 481 0 0 98 40 387.89 high
1 2023-01-01 2023-01-02 8 351 332 0 1 52 35 891.03 high
In [519]:
vdata[['Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity','Product_Price']].corr()
Out[519]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
Lead_Time 1.000000 -0.012577 -0.000286 0.003119 0.004990 0.010200 -0.010591 0.007505
Demand_Forecast -0.012577 1.000000 0.022158 0.004193 -0.004262 -0.009859 0.003635 0.002716
Inventory_Level -0.000286 0.022158 1.000000 0.014796 -0.010471 0.014334 0.006217 0.000488
Stockout_Flag 0.003119 0.004193 0.014796 1.000000 0.018531 0.002791 0.010854 -0.025788
Backorder_Flag 0.004990 -0.004262 -0.010471 0.018531 1.000000 0.010820 -0.012485 0.006338
Order_Quantity 0.010200 -0.009859 0.014334 0.002791 0.010820 1.000000 -0.007902 0.027519
Shipment_Quantity -0.010591 0.003635 0.006217 0.010854 -0.012485 -0.007902 1.000000 0.002426
Product_Price 0.007505 0.002716 0.000488 -0.025788 0.006338 0.027519 0.002426 1.000000
In [520]:
sns.heatmap(vdata[['Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity','Product_Price']].corr(), annot=True, cmap='viridis')
Out[520]:
<Axes: >
No description has been provided for this image
                                                         N-N
                                                        =====
In [521]:
vdata[['Order_Quantity','Product_Price']].corr()
Out[521]:
Order_Quantity Product_Price
Order_Quantity 1.000000 0.027519
Product_Price 0.027519 1.000000
In [522]:
print(blue("====================="))
print(green("AnnualSalary vs Age",['bold']))
print(blue("====================="))
display(round(vdata.groupby('Order_Priority')['Demand_Forecast'].mean().sort_values(ascending=False),2))
print(yellow("========================================",['bold']))
display(round(vdata.groupby('Order_Priority')['Demand_Forecast'].min().sort_values(ascending=False),2))
print(yellow("========================================",['bold']))
display(round(vdata.groupby('Order_Priority')['Demand_Forecast'].max().sort_values(ascending=False),2))
print(yellow("========================================",['bold']))
=====================
AnnualSalary vs Age
=====================
Order_Priority
medium    555.00
low       554.70
high      544.37
Name: Demand_Forecast, dtype: float64
========================================
Order_Priority
low       101
high      100
medium    100
Name: Demand_Forecast, dtype: int64
========================================
Order_Priority
high      999
low       999
medium    999
Name: Demand_Forecast, dtype: int64
========================================
In [ ]:
 

Missing Values¶

In [523]:
vdata[vdata.isnull().sum(axis=1)>=1]
Out[523]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price Order_Priority
In [524]:
vdata.isnull().sum()
Out[524]:
Order_Date           0
Shipment_Date        0
Lead_Time            0
Demand_Forecast      0
Inventory_Level      0
Stockout_Flag        0
Backorder_Flag       0
Order_Quantity       0
Shipment_Quantity    0
Product_Price        0
Order_Priority       0
dtype: int64

Outliers¶

In [525]:
numdata = vdata.select_dtypes(exclude="O")
In [526]:
numdata.head()
Out[526]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 2023-01-01 2023-01-02 8 839 481 0 0 98 40 387.89
1 2023-01-01 2023-01-02 8 351 332 0 1 52 35 891.03
2 2023-01-01 2023-01-02 5 398 156 0 0 34 43 933.21
3 2023-01-01 2023-01-02 5 972 418 1 0 32 45 628.57
4 2023-01-01 2023-01-02 3 529 428 0 0 44 41 70.33
In [527]:
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px

init_notebook_mode(connected=True)
In [528]:
px.box(numdata.Demand_Forecast, orientation='h', width=1000, height=350)

Outliers Identifying¶

In [529]:
def Outlier_Identification(vdata):
    
    numdata = vdata.select_dtypes(exclude="O")

    for col in numdata.columns:
        print("Column:", col)
        print()
        # quantiles
        q1 = numdata[col].quantile(0.25) 
        q3 = numdata[col].quantile(0.75)
        # iqr
        iqr = q3-q1
        # ranges
        lowerbound = q1-(1.5*iqr)
        upperbound = q3+(1.5*iqr)

        print("********* Outlier Data Points *******")
        print()
        lowerout = []
        upperout = []

        for val in numdata[col]:
            if val<lowerbound:
                if val not in lowerout:
                    lowerout.append(val)
            elif val>upperbound:
                if val not in upperout:
                    upperout.append(val)

        lowerout.sort()
        upperout.sort()

        print("Lower Outliers:")
        print(lowerout)
        print()
        print()
        print("Upper Outliers:")
        print(upperout)
        print()
        print("===============================================")
        print()
In [530]:
Outlier_Identification(numdata)
Column: Order_Date

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Shipment_Date

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Lead_Time

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Demand_Forecast

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Inventory_Level

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Stockout_Flag

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[1, 2]

===============================================

Column: Backorder_Flag

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[1, 2]

===============================================

Column: Order_Quantity

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Shipment_Quantity

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Product_Price

********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Outliers Detection¶

In [531]:
import plotly.express as px

def outlier_detect(vdata):
    for col in vdata.describe().columns:
        print("Column:",col)
        print("------------------------------------------------")
        print("Boxplot For Outlier Identification:")
        px.box(vdata[col], orientation='h', width=600, height=300, ).show()
        print()
        Q1 = vdata.describe().at['25%',col]
        Q3 = vdata.describe().at['75%',col]
        IQR = Q3 - Q1
        lowerbound = Q1 - 1.5 * IQR
        upperbound = Q3 + 1.5 * IQR
        
        print("********* Outlier Data Points *******")
        print()
        lowerout = []
        upperout = []

        for val in vdata[col]:
            if val<lowerbound:
                if val not in lowerout:
                    lowerout.append(val)
            elif val>upperbound:
                if val not in upperout:
                    upperout.append(val)

        lowerout.sort()
        upperout.sort()

        print("Lower Outliers:")
        print(lowerout)
        print()
        print()
        print("Upper Outliers:")
        print(upperout)
        print()
        print("===============================================")
        print()

def outlier_replacement(vdata):
    for col in vdata.describe().columns:
        print("Column:",col)
        print("------------------------------------------------")
        Q1 = vdata.describe().at['25%',col]
        Q3 = vdata.describe().at['75%',col]
        IQR = Q3 - Q1
        LTV = Q1 - 1.5 * IQR
        UTV = Q3 + 1.5 * IQR
        
        # replacement vals (any one of the below)
        
        # median
        median = vdata[col].median()
        
        # Ltv, Utv
        low_bound = LTV
        high_bound = UTV
        
        # 5th & 95th
        fifth = vdata[col].quantile(0.05)
        ninetyfifth = vdata[col].quantile(0.95)

        print("Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....")
        print("Adjust the module code for any other replacements.........")
        print()
        
        # mask method is used to replace the values
        vdata[col] = vdata[col].mask(vdata[col]<LTV, fifth) # replacing the lower outlier with 5th percentile value
        vdata[col] = vdata[col].mask(vdata[col]>UTV, ninetyfifth)
In [532]:
outlier_detect(vdata)
Column: Order_Date
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Shipment_Date
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Lead_Time
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Demand_Forecast
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Inventory_Level
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Stockout_Flag
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[1, 2]

===============================================

Column: Backorder_Flag
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[1, 2]

===============================================

Column: Order_Quantity
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Shipment_Quantity
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Product_Price
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

In [533]:
outlier_replacement(vdata)
Column: Order_Date
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Shipment_Date
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Lead_Time
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Demand_Forecast
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Inventory_Level
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Stockout_Flag
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Backorder_Flag
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Order_Quantity
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Shipment_Quantity
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

Column: Product_Price
------------------------------------------------
Replacing Outliers with 5th percentile for lower Outliers, 95th percentile for Upper Outliers....
Adjust the module code for any other replacements.........

In [534]:
outlier_detect(vdata)
Column: Order_Date
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Shipment_Date
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Lead_Time
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Demand_Forecast
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Inventory_Level
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Stockout_Flag
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[1]

===============================================

Column: Backorder_Flag
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[1]

===============================================

Column: Order_Quantity
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Shipment_Quantity
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

Column: Product_Price
------------------------------------------------
Boxplot For Outlier Identification:
********* Outlier Data Points *******

Lower Outliers:
[]


Upper Outliers:
[]

===============================================

In [535]:
vdata.shape
Out[535]:
(9000, 11)
In [536]:
vdata.size
Out[536]:
99000
In [537]:
vdata.isnull().sum()
Out[537]:
Order_Date           0
Shipment_Date        0
Lead_Time            0
Demand_Forecast      0
Inventory_Level      0
Stockout_Flag        0
Backorder_Flag       0
Order_Quantity       0
Shipment_Quantity    0
Product_Price        0
Order_Priority       0
dtype: int64

Predictive Modeling¶

X & Y¶

In [538]:
X = vdata.drop('Order_Priority',axis=1)
y = vdata.Order_Priority
In [539]:
X
Out[539]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 2023-01-01 2023-01-02 8 839 481 0 0 98 40 387.89
1 2023-01-01 2023-01-02 8 351 332 0 1 52 35 891.03
2 2023-01-01 2023-01-02 5 398 156 0 0 34 43 933.21
3 2023-01-01 2023-01-02 5 972 418 1 0 32 45 628.57
4 2023-01-01 2023-01-02 3 529 428 0 0 44 41 70.33
... ... ... ... ... ... ... ... ... ... ...
8995 2024-01-10 2024-01-11 4 426 31 0 0 22 15 83.02
8996 2024-01-10 2024-01-11 8 172 432 0 0 11 45 116.94
8997 2024-01-10 2024-01-11 9 355 414 0 0 36 2 352.30
8998 2024-01-10 2024-01-11 4 842 107 0 0 53 58 361.41
8999 2024-01-10 2024-01-11 6 269 78 0 0 76 38 978.42

9000 rows × 10 columns

In [540]:
y
Out[540]:
0         high
1         high
2       medium
3       medium
4       medium
         ...  
8995       low
8996    medium
8997    medium
8998    medium
8999    medium
Name: Order_Priority, Length: 9000, dtype: object

Feature Engineering X¶

In [541]:
X = X.drop(['Order_Date', 'Shipment_Date'], axis=1)
In [542]:
X
Out[542]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 8 839 481 0 0 98 40 387.89
1 8 351 332 0 1 52 35 891.03
2 5 398 156 0 0 34 43 933.21
3 5 972 418 1 0 32 45 628.57
4 3 529 428 0 0 44 41 70.33
... ... ... ... ... ... ... ... ...
8995 4 426 31 0 0 22 15 83.02
8996 8 172 432 0 0 11 45 116.94
8997 9 355 414 0 0 36 2 352.30
8998 4 842 107 0 0 53 58 361.41
8999 6 269 78 0 0 76 38 978.42

9000 rows × 8 columns

  • Checking for Duplicated Rows
In [543]:
dindx = X[X.duplicated()].index
In [544]:
len(dindx)
Out[544]:
0
In [545]:
# Droping above duplicated indx from both x & y

X = X.drop(dindx, axis=0).reset_index(drop=True)
y = y.drop(dindx, axis=0).reset_index(drop=True)
In [546]:
X.head()
Out[546]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 8 839 481 0 0 98 40 387.89
1 8 351 332 0 1 52 35 891.03
2 5 398 156 0 0 34 43 933.21
3 5 972 418 1 0 32 45 628.57
4 3 529 428 0 0 44 41 70.33
In [547]:
y.head()
Out[547]:
0      high
1      high
2    medium
3    medium
4    medium
Name: Order_Priority, dtype: object
In [548]:
vdata.head(2)
Out[548]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price Order_Priority
0 2023-01-01 2023-01-02 8 839 481 0 0 98 40 387.89 high
1 2023-01-01 2023-01-02 8 351 332 0 1 52 35 891.03 high
  • Saving above X data format for future predictions
In [549]:
X.to_excel("Supply_Chain_FinalData.xlsx", index=False)
In [551]:
X
Out[551]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 8 839 481 0 0 98 40 387.89
1 8 351 332 0 1 52 35 891.03
2 5 398 156 0 0 34 43 933.21
3 5 972 418 1 0 32 45 628.57
4 3 529 428 0 0 44 41 70.33
... ... ... ... ... ... ... ... ...
8995 4 426 31 0 0 22 15 83.02
8996 8 172 432 0 0 11 45 116.94
8997 9 355 414 0 0 36 2 352.30
8998 4 842 107 0 0 53 58 361.41
8999 6 269 78 0 0 76 38 978.42

9000 rows × 8 columns

  • Encoding : -converting categorical clos to numerical
In [552]:
X.select_dtypes("O")
Out[552]:
0
1
2
3
4
...
8995
8996
8997
8998
8999

9000 rows × 0 columns

In [553]:
import pandas as pd
print(pd.__version__)
2.0.3

Scaling

  • Transforming Numeric Cols data under one scale if there is a scale difference
In [554]:
X.head(2)
Out[554]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 8 839 481 0 0 98 40 387.89
1 8 351 332 0 1 52 35 891.03
                                                  Using Sklearn Scaler functions
In [555]:
# Module

from sklearn.preprocessing import StandardScaler

# Object Define

sc = StandardScaler()
In [556]:
# Considering only the main numeric cols, not one hot encoded cols

X.iloc[:, :]
Out[556]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 8 839 481 0 0 98 40 387.89
1 8 351 332 0 1 52 35 891.03
2 5 398 156 0 0 34 43 933.21
3 5 972 418 1 0 32 45 628.57
4 3 529 428 0 0 44 41 70.33
... ... ... ... ... ... ... ... ...
8995 4 426 31 0 0 22 15 83.02
8996 8 172 432 0 0 11 45 116.94
8997 9 355 414 0 0 36 2 352.30
8998 4 842 107 0 0 53 58 361.41
8999 6 269 78 0 0 76 38 978.42

9000 rows × 8 columns

In [557]:
# Applying fit_transform on selected cols

X.iloc[:, :] = sc.fit_transform(X.iloc[:, :])
In [558]:
X.head()
Out[558]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 1.174541 1.110190 1.605327 -0.499305 -0.431374 1.647415 -0.327234 -0.400310
1 1.174541 -0.773367 0.577940 -0.499305 2.318176 0.058842 -0.501130 1.347573
2 0.011416 -0.591959 -0.635618 -0.499305 -0.431374 -0.562773 -0.222896 1.494105
3 0.011416 1.623537 1.170928 2.002782 -0.431374 -0.631841 -0.153338 0.435800
4 -0.764001 -0.086332 1.239881 -0.499305 -0.431374 -0.217431 -0.292454 -1.503498

Final X data after feature engineering¶

In [559]:
X
Out[559]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 1.174541 1.110190 1.605327 -0.499305 -0.431374 1.647415 -0.327234 -0.400310
1 1.174541 -0.773367 0.577940 -0.499305 2.318176 0.058842 -0.501130 1.347573
2 0.011416 -0.591959 -0.635618 -0.499305 -0.431374 -0.562773 -0.222896 1.494105
3 0.011416 1.623537 1.170928 2.002782 -0.431374 -0.631841 -0.153338 0.435800
4 -0.764001 -0.086332 1.239881 -0.499305 -0.431374 -0.217431 -0.292454 -1.503498
... ... ... ... ... ... ... ... ...
8995 -0.376292 -0.483886 -1.497520 -0.499305 -0.431374 -0.977183 -1.196713 -1.459414
8996 1.174541 -1.464262 1.267461 -0.499305 -0.431374 -1.357059 -0.153338 -1.341577
8997 1.562249 -0.757928 1.143348 -0.499305 -0.431374 -0.493705 -1.648843 -0.523948
8998 -0.376292 1.121770 -0.973483 -0.499305 -0.431374 0.093377 0.298792 -0.492301
8999 0.399124 -1.089867 -1.173444 -0.499305 -0.431374 0.887663 -0.396792 1.651162

9000 rows × 8 columns

Train - Test Split¶

In [560]:
# Module

from sklearn.model_selection import train_test_split
In [561]:
print("Shape of X:", X.shape)  # Should print (9000, n_features)
print("Shape of y:", y.shape)  # Should print (9000,) if it's a 1D array
Shape of X: (9000, 8)
Shape of y: (9000,)
In [562]:
# Split

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=123)
In [563]:
# Checking Shapes

xtrain.shape, xtest.shape, ytrain.shape, ytest.shape
Out[563]:
((6750, 8), (2250, 8), (6750,), (2250,))
In [564]:
# For Model training

display(xtrain.head())
ytrain.head()
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
3803 1.174541 -0.981794 -1.173444 -0.499305 -0.431374 -1.253457 -0.953259 0.451329
7615 0.011416 0.828429 -0.139162 2.002782 -0.431374 -1.011717 0.194454 0.116231
1357 -0.764001 1.581080 1.136452 -0.499305 -0.431374 -0.631841 -1.544505 -1.650933
5627 0.011416 1.569501 1.529480 -0.499305 -0.431374 0.922197 1.446505 1.581197
1518 0.011416 -0.425990 1.529480 -0.499305 -0.431374 0.784060 1.307389 -1.313438
Out[564]:
3803      high
7615      high
1357       low
5627    medium
1518    medium
Name: Order_Priority, dtype: object
In [565]:
X.dtypes
Out[565]:
Lead_Time            float64
Demand_Forecast      float64
Inventory_Level      float64
Stockout_Flag        float64
Backorder_Flag       float64
Order_Quantity       float64
Shipment_Quantity    float64
Product_Price        float64
dtype: object
In [566]:
xtrain.columns
Out[566]:
Index(['Lead_Time', 'Demand_Forecast', 'Inventory_Level', 'Stockout_Flag',
       'Backorder_Flag', 'Order_Quantity', 'Shipment_Quantity',
       'Product_Price'],
      dtype='object')

Logistic Regression¶

In [567]:
# Sklearn module for Logistic Regression

from sklearn.linear_model import LogisticRegression

# Model Define

log_multi = LogisticRegression()  
                                                    Model Training on Train data
In [568]:
log_multi.fit(xtrain , ytrain)
Out[568]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
                                                        Model Params
In [569]:
log_multi.get_params()
Out[569]:
{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}
In [570]:
log_multi.coef_
Out[570]:
array([[ 0.00353743, -0.02518099, -0.03211757,  0.02614409,  0.02484145,
        -0.01340823,  0.01387192, -0.00722919],
       [-0.00339334,  0.01125341,  0.0187578 , -0.03457768, -0.01697497,
         0.00893966, -0.02357235, -0.00424925],
       [-0.00014408,  0.01392759,  0.01335977,  0.00843359, -0.00786648,
         0.00446857,  0.00970043,  0.01147844]])
In [571]:
log_multi.intercept_
Out[571]:
array([-0.01629843,  0.0192302 , -0.00293178])
In [572]:
# predictions on xtest data

ypred = log_multi.predict(xtest)
In [573]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
In [574]:
# Actual Value counts

ytest.value_counts()
Out[574]:
Order_Priority
high      776
medium    765
low       709
Name: count, dtype: int64
In [575]:
# Confusion Matrix

cm = confusion_matrix(ytest, ypred, labels=log_multi.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=log_multi.classes_)
disp.plot()
plt.figure(figsize=(3,3))
plt.show()
No description has been provided for this image
<Figure size 300x300 with 0 Axes>
In [576]:
print(classification_report(ytest, ypred))
              precision    recall  f1-score   support

        high       0.37      0.30      0.33       776
         low       0.32      0.59      0.42       709
      medium       0.35      0.15      0.21       765

    accuracy                           0.34      2250
   macro avg       0.35      0.35      0.32      2250
weighted avg       0.35      0.34      0.32      2250

                                                   Bias variance Trade off
In [577]:
from sklearn.metrics import accuracy_score

trainscore = accuracy_score(ytrain, log_multi.predict(xtrain))
testscore = accuracy_score(ytest, log_multi.predict(xtest))

print("Train  Acc Score - {}".format(round(trainscore,2)))
print("Test  Acc Score - {}".format(round(testscore,2)))
Train  Acc Score - 0.35
Test  Acc Score - 0.34

¶

K Nearest Neighbors¶

                                                         Model Define
In [578]:
# Sklearn module for Logistic Regression

from sklearn.neighbors import KNeighborsClassifier

# Model Define

knn_multi = KNeighborsClassifier(n_neighbors=5)
                                                   Model Training on train data
In [579]:
knn_multi.fit(xtrain, ytrain)
Out[579]:
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
                                                    Model Params
In [580]:
knn_multi.get_params()
Out[580]:
{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}
                                                     Model Evaluation
In [581]:
# Checking Class count of output column

y.value_counts()
Out[581]:
Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64
                                             Output column is Imbalanced
In [582]:
# predictions on xtest data

ypred = knn_multi.predict(xtest)
In [583]:
from sklearn.metrics import confusion_matrix, classification_report
In [584]:
# Confusion Matrix

cm = confusion_matrix(ytest, ypred, labels=knn_multi.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=knn_multi.classes_)
disp.plot()
plt.figure(figsize=(3,3))
plt.show()
No description has been provided for this image
<Figure size 300x300 with 0 Axes>
In [585]:
# Test Scores

print(classification_report(ytest, ypred))
              precision    recall  f1-score   support

        high       0.35      0.46      0.40       776
         low       0.32      0.34      0.33       709
      medium       0.33      0.20      0.25       765

    accuracy                           0.33      2250
   macro avg       0.33      0.33      0.32      2250
weighted avg       0.33      0.33      0.32      2250

In [586]:
from sklearn.metrics import accuracy_score

trainscore = accuracy_score(ytrain, knn_multi.predict(xtrain))
testscore = accuracy_score(ytest, knn_multi.predict(xtest))

print("Train  Acc Score - {}".format(round(trainscore,2)))
print("Test  Acc Score - {}".format(round(testscore,2)))
Train  Acc Score - 0.55
Test  Acc Score - 0.33

Naive Bayes¶

                                                        Model Define                                  
In [587]:
# Sklearn module for Logistic Regression

# According to data considering multinomial naive bayes

from sklearn.naive_bayes import MultinomialNB

# Model Define

nb_multi = MultinomialNB()
                                                  Model Training on train data
In [588]:
# Check for negative values in xtrain
print("Minimum value in xtrain:", xtrain.min().min())
Minimum value in xtrain: -1.7421644337459863
In [589]:
# Replace negative values with zero
xtrain[xtrain < 0] = 0
In [590]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the training data
xtrain = scaler.fit_transform(xtrain)
In [591]:
# Apply log transformation to handle counts
xtrain = np.log1p(xtrain)  # log1p is log(1 + x)
In [592]:
nb_multi.fit(xtrain, ytrain)
Out[592]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
                                                    Model Params
In [593]:
knn_multi.get_params()
Out[593]:
{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}
                                                    Model Evalution
In [594]:
# Checking Class count of output column

y.value_counts()
Out[594]:
Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64
                                                Output column is Imbalanced
In [595]:
# predictions on xtest data

ypred = nb_multi.predict(xtest)
In [596]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
In [597]:
# Actual Value counts

ytest.value_counts()
Out[597]:
Order_Priority
high      776
medium    765
low       709
Name: count, dtype: int64
In [598]:
# Confusion Matrix

cm = confusion_matrix(ytest, ypred, labels=nb_multi.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=nb_multi.classes_)
disp.plot()
plt.figure(figsize=(3,3))
plt.show()
No description has been provided for this image
<Figure size 300x300 with 0 Axes>
In [599]:
# Scores from Confusion Matrix

print(classification_report(ytest, ypred))
              precision    recall  f1-score   support

        high       0.36      0.31      0.33       776
         low       0.32      0.61      0.42       709
      medium       0.29      0.10      0.14       765

    accuracy                           0.33      2250
   macro avg       0.32      0.34      0.30      2250
weighted avg       0.33      0.33      0.29      2250

                                                    Bias variance Trade off
In [600]:
from sklearn.metrics import accuracy_score

trainscore = accuracy_score(ytrain, nb_multi.predict(xtrain))
testscore = accuracy_score(ytest, nb_multi.predict(xtest))

print("Train  Acc Score - {}".format(round(trainscore,2)))
print("Test  Acc Score - {}".format(round(testscore,2)))
Train  Acc Score - 0.35
Test  Acc Score - 0.33

Support Vector Machine¶

                                                    Model Define
In [601]:
# Sklearn module for SVM

from sklearn.svm import SVC

# Model Define

svm_multi = SVC(kernel='rbf', probability=True)
                                                Model Training on train data
In [602]:
svm_multi.fit(xtrain, ytrain)
Out[602]:
SVC(probability=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(probability=True)
                                                           Model Params
In [603]:
svm_multi.get_params()
Out[603]:
{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}
                                                    Model Evalution
In [604]:
# Checking Class count of output column

y.value_counts()
Out[604]:
Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64
                                                  Output Column is Imbalanced
In [605]:
# predictions on xtest data

ypred = svm_multi.predict(xtest)
In [606]:
from sklearn.metrics import confusion_matrix, classification_report
In [607]:
# Actual Value counts

ytest.value_counts()
Out[607]:
Order_Priority
high      776
medium    765
low       709
Name: count, dtype: int64
In [608]:
# Confusion Matrix

cm = confusion_matrix(ytest, ypred, labels=svm_multi.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=svm_multi.classes_)
disp.plot()
plt.figure(figsize=(3,3))
plt.show()
No description has been provided for this image
<Figure size 300x300 with 0 Axes>
In [609]:
# Test Scores

print(classification_report(ytest, ypred))
              precision    recall  f1-score   support

        high       0.00      0.00      0.00       776
         low       0.32      1.00      0.48       709
      medium       1.00      0.00      0.00       765

    accuracy                           0.32      2250
   macro avg       0.44      0.33      0.16      2250
weighted avg       0.44      0.32      0.15      2250

                                                     Bias Variance Trade Off
In [610]:
from sklearn.metrics import accuracy_score

trainscore = accuracy_score(ytrain, svm_multi.predict(xtrain))
testscore = accuracy_score(ytest, svm_multi.predict(xtest))

print("Train  Acc Score - {}".format(round(trainscore,2)))
print("Test  Acc Score - {}".format(round(testscore,2)))
Train  Acc Score - 0.41
Test  Acc Score - 0.32

Decision Trees¶

                                                       Model Define
In [611]:
# Sklearn module

from sklearn.tree import DecisionTreeClassifier

# Model Define

dt_multi = DecisionTreeClassifier()  # Hyperparams: criterion, max_depth
                                                 Model Training on train data
In [612]:
dt_multi.fit(xtrain, ytrain)
Out[612]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
                                                      Model Params
In [613]:
dt_multi.get_params()
Out[613]:
{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}
                                                    Feature Importance
In [614]:
import sklearn
print(sklearn.__version__)
1.5.2
In [615]:
feature_names = ['Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity', 'Product_Price']  # Replace with actual feature names

for i, j in zip(feature_names, dt_multi.feature_importances_):
    print(i, ":", round(j, 2), end=", ")
Lead_Time : 0.12, Demand_Forecast : 0.15, Inventory_Level : 0.16, Stockout_Flag : 0.04, Backorder_Flag : 0.03, Order_Quantity : 0.15, Shipment_Quantity : 0.17, Product_Price : 0.2, 
                                                        Tree Visualization
In [616]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

feature_names = ['Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity', 'Product_Price']  # Replace with actual feature names
plot_tree(dt_multi, filled=True, fontsize=8, feature_names=feature_names)
plt.show()
No description has been provided for this image
In [ ]:
 
                                               Model Evalution
In [617]:
# Checking Class count of output column

y.value_counts()
Out[617]:
Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64
                                             Output column is Imbalanced
In [618]:
# predictions on xtest data

ypred = dt_multi.predict(xtest)
In [619]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
In [620]:
# Actual Value counts

ytest.value_counts()
Out[620]:
Order_Priority
high      776
medium    765
low       709
Name: count, dtype: int64
In [621]:
# Confusion Matrix

cm = confusion_matrix(ytest, ypred, labels=dt_multi.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=dt_multi.classes_)
disp.plot()
plt.figure(figsize=(3,3))
plt.show()
No description has been provided for this image
<Figure size 300x300 with 0 Axes>
In [622]:
# Test Scores

print(classification_report(ytest, ypred))
              precision    recall  f1-score   support

        high       0.34      0.27      0.30       776
         low       0.32      0.42      0.37       709
      medium       0.31      0.29      0.30       765

    accuracy                           0.32      2250
   macro avg       0.32      0.33      0.32      2250
weighted avg       0.32      0.32      0.32      2250

                                              Bias Variance Trade Off
In [623]:
from sklearn.metrics import accuracy_score

trainscore = accuracy_score(ytrain, dt_multi.predict(xtrain))
testscore = accuracy_score(ytest, dt_multi.predict(xtest))

print("Train  Acc Score - {}".format(round(trainscore,2)))
print("Test  Acc Score - {}".format(round(testscore,2)))
Train  Acc Score - 0.97
Test  Acc Score - 0.32

Random Forest¶

                                             Model Define
In [624]:
# Sklearn module

from sklearn.ensemble import RandomForestClassifier

# Model Define

rf_multi = RandomForestClassifier()  # Hyperparams: n_estimators, criterion, max_depth
                                               Model Training on Train Data
In [625]:
rf_multi.fit(xtrain, ytrain)
Out[625]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
                                                         Model Params
In [626]:
rf_multi.get_params()
Out[626]:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
                                                  Feature Importance
In [627]:
feature_names = ['Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity', 'Product_Price']  # Replace with actual feature names
for i, j in zip(feature_names, rf_multi.feature_importances_):
    print(i, ":", round(j, 2), end=", ")
Lead_Time : 0.09, Demand_Forecast : 0.19, Inventory_Level : 0.18, Stockout_Flag : 0.02, Backorder_Flag : 0.02, Order_Quantity : 0.15, Shipment_Quantity : 0.16, Product_Price : 0.19, 
                                                    Tree Visualization
In [628]:
# Replace these with the actual feature names in your dataset
feature_names = ['Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity', 'Product_Price', ...]  

plt.figure(figsize=(18, 18))
plot_tree(rf_multi.estimators_[0], filled=True, fontsize=8, feature_names=feature_names)
plt.show()
No description has been provided for this image
In [ ]:
 
                                                         Model Evaluation
In [629]:
# Checking Class count of output column

y.value_counts()
Out[629]:
Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64
In [630]:
# predictions on xtest data

ypred = rf_multi.predict(xtest)
In [631]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
In [632]:
# Actual Value counts

ytest.value_counts()
Out[632]:
Order_Priority
high      776
medium    765
low       709
Name: count, dtype: int64
In [633]:
# Confusion Matrix

cm = confusion_matrix(ytest, ypred, labels=rf_multi.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=rf_multi.classes_)
disp.plot()
plt.figure(figsize=(3,3))
plt.show()
No description has been provided for this image
<Figure size 300x300 with 0 Axes>
In [634]:
# Test Scores

print(classification_report(ytest, ypred))
              precision    recall  f1-score   support

        high       0.34      0.35      0.34       776
         low       0.31      0.38      0.34       709
      medium       0.32      0.25      0.28       765

    accuracy                           0.32      2250
   macro avg       0.32      0.32      0.32      2250
weighted avg       0.32      0.32      0.32      2250

                                                   Bias Variance Trade off
In [635]:
from sklearn.metrics import accuracy_score

trainscore = accuracy_score(ytrain, rf_multi.predict(xtrain))
testscore = accuracy_score(ytest, rf_multi.predict(xtest))

print("Train  Acc Score - {}".format(round(trainscore,2)))
print("Test  Acc Score - {}".format(round(testscore,2)))
Train  Acc Score - 0.97
Test  Acc Score - 0.32

Xgboost¶

                                                    Model define
In [636]:
# Install module  : pip install xgboost

# Loading Module

from xgboost import XGBClassifier

# Model Define

xgb_multi = XGBClassifier() # Hyperparams: multiple hyp params , consider default
                                            Model Training on Train data
In [637]:
from sklearn.preprocessing import LabelEncoder

# Initialize the encoder
le = LabelEncoder()

# Fit and transform ytrain to numerical values
ytrain_encoded = le.fit_transform(ytrain)

# Transform ytest if you have a test set
ytest_encoded = le.transform(ytest)
In [638]:
xgb_multi.fit(xtrain, ytrain_encoded)
Out[638]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
In [639]:
ypred = xgb_multi.predict(xtest)
ypred_labels = le.inverse_transform(ypred)  # Convert numerical predictions back to original labels
                                                         Model Params
In [640]:
print(xgb_multi.get_params())
{'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
                                                    Feature Importance
In [641]:
# Feature Importance
feature_names = ['Lead_Time','Demand_Forecast','Inventory_Level','Stockout_Flag','Backorder_Flag','Order_Quantity','Shipment_Quantity', 'Product_Price']  # Replace with actual feature names

for i, j in zip(feature_names, xgb_multi.feature_importances_):
    print(i,":",round(j,2), end=", ")
Lead_Time : 0.12, Demand_Forecast : 0.13, Inventory_Level : 0.12, Stockout_Flag : 0.13, Backorder_Flag : 0.13, Order_Quantity : 0.12, Shipment_Quantity : 0.12, Product_Price : 0.13, 
                                              Single Tree Visualization
In [642]:
from xgboost import plot_tree
In [643]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(30, 30))
plot_tree(xgb_multi, num_trees=0, ax=ax)
plt.show()
No description has been provided for this image
                                                    Evalution Model
In [644]:
# Checking Class count of output column

y.value_counts()
Out[644]:
Order_Priority
medium    3007
low       3002
high      2991
Name: count, dtype: int64
In [645]:
# predictions on xtest data

ypred = xgb_multi.predict(xtest)
In [646]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
In [647]:
# Actual Value counts

ytest.value_counts()
Out[647]:
Order_Priority
high      776
medium    765
low       709
Name: count, dtype: int64
In [648]:
ytest = ytest.astype(str)
ypred = ypred.astype(str)
In [649]:
class_labels = xgb_multi.classes_.astype(str)  # Convert classes to strings
In [650]:
missing_labels = set(class_labels) - set(ytest)
print("Missing labels:", missing_labels)
Missing labels: {'2', '1', '0'}
In [651]:
filtered_class_labels = [label for label in class_labels if label in ytest]
In [652]:
if not filtered_class_labels:
    print("No matching labels found between `class_labels` and `ytest`.")
else:
    print(f"Filtered class labels: {filtered_class_labels}")
No matching labels found between `class_labels` and `ytest`.
In [653]:
print("Unique labels in ytest:", set(ytest))
print("Class labels:", class_labels)
Unique labels in ytest: {'low', 'medium', 'high'}
Class labels: ['0' '1' '2']
In [654]:
ytest = ytest.astype(str)  # Ensure ytest is of the same type as class_labels
ypred = ypred.astype(str)  # Ensure ypred is also consistent
In [655]:
filtered_class_labels = [label for label in class_labels if label in ytest]
print("Filtered class labels:", filtered_class_labels)
Filtered class labels: []
In [656]:
# Confusion Matrix

# cm = confusion_matrix(ytest, ypred, labels=xgb_multi.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=xgb_multi.classes_)
# disp.plot()
# plt.figure(figsize=(3,3))
# plt.show()
In [657]:
from sklearn.metrics import classification_report
print(classification_report(ytest, ypred))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
        high       0.00      0.00      0.00     776.0
         low       0.00      0.00      0.00     709.0
      medium       0.00      0.00      0.00     765.0

    accuracy                           0.00    2250.0
   macro avg       0.00      0.00      0.00    2250.0
weighted avg       0.00      0.00      0.00    2250.0

                                                       Bias-Variance Trade Off
In [658]:
from sklearn.metrics import accuracy_score

trainscore = accuracy_score(ytrain, xgb_multi.predict(xtrain))
testscore = accuracy_score(ytest, xgb_multi.predict(xtest))

print("Train  Acc Score - {}".format(round(trainscore,2)))
print("Test  Acc Score - {}".format(round(testscore,2)))
Train  Acc Score - 0.0
Test  Acc Score - 0.0
In [ ]:
 

Hyp Param Tuning¶

Model Deployment- Saving Trained Model¶

In [659]:
import joblib


joblib.dump(sc ,  "sc.pkl")
joblib.dump(log_multi, "log_multi.pkl")
Out[659]:
['log_multi.pkl']

Real Time Prediction¶

In [670]:
# Loading Saved model 

import joblib

sc = joblib.load("sc.pkl")
log_multi = joblib.load("log_multi.pkl")
In [671]:
pd.read_excel(r"C:\Users\mekal\Supply_Chain_FinalData.xlsx")
Out[671]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 8 839 481 0 0 98 40 387.89
1 8 351 332 0 1 52 35 891.03
2 5 398 156 0 0 34 43 933.21
3 5 972 418 1 0 32 45 628.57
4 3 529 428 0 0 44 41 70.33
... ... ... ... ... ... ... ... ...
8995 4 426 31 0 0 22 15 83.02
8996 8 172 432 0 0 11 45 116.94
8997 9 355 414 0 0 36 2 352.30
8998 4 842 107 0 0 53 58 361.41
8999 6 269 78 0 0 76 38 978.42

9000 rows × 8 columns

In [672]:
X.head()
Out[672]:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 1.174541 1.110190 1.605327 -0.499305 -0.431374 1.647415 -0.327234 -0.400310
1 1.174541 -0.773367 0.577940 -0.499305 2.318176 0.058842 -0.501130 1.347573
2 0.011416 -0.591959 -0.635618 -0.499305 -0.431374 -0.562773 -0.222896 1.494105
3 0.011416 1.623537 1.170928 2.002782 -0.431374 -0.631841 -0.153338 0.435800
4 -0.764001 -0.086332 1.239881 -0.499305 -0.431374 -0.217431 -0.292454 -1.503498
In [673]:
X.dtypes
Out[673]:
Lead_Time            float64
Demand_Forecast      float64
Inventory_Level      float64
Stockout_Flag        float64
Backorder_Flag       float64
Order_Quantity       float64
Shipment_Quantity    float64
Product_Price        float64
dtype: object
In [674]:
vdata.head(2)
Out[674]:
Order_Date Shipment_Date Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price Order_Priority
0 2023-01-01 2023-01-02 8 839 481 0 0 98 40 387.89 high
1 2023-01-01 2023-01-02 8 351 332 0 1 52 35 891.03 high
In [675]:
def predictsupplychain():
    print("Supply_Chain_Input_Data :")
    import pandas as pd
    userinpdata = pd.read_excel(r"C:\Users\mekal\Supply_Chain_FinalData.xlsx")
    display(userinpdata.head())
    print("Logistic Regression built on the below X columns:")
    print()
    print(userinpdata.columns)
    print("======================= Enter Input Data ====================")
    print()
    LT = eval(input("Lead Time :"))
    print()
    DF = eval(input("Demand Forecast :"))
    print()
    IL = eval(input("How much stock level :"))
    print()
    SF = input("Stockout or not:")
    print()
    BOF = input("How many back orders:")
    print()
    OQ = input("How much order quantity:")
    print()
    SQ = input("Shipment Quantity:")
    print()
    PP = eval(input("Product Price:"))
    print()
    
     
    row = pd.DataFrame([[LT,DF,IL,SF,BOF,OQ,SQ,PP]], columns = userinpdata.columns)
    
    print("Given User Input Data:")
    display(row)
    print()
    
    ####### Data Pre-Processing #######
    
    # ohedata = ohe_multi.transform(row.select_dtypes("O")).toarray()
    # ohedata = pd.DataFrame(ohedata, columns=ohe_multi.get_feature_names_out())
    # row =  pd.concat([row.select_dtypes(exclude="O"), ohedata], axis=1)
    
    print("********** Prediction ***********")
    print()
    num_classes = log_multi.classes_.size
    print("Number of classes:", num_classes)
    prob0 = round(log_multi.predict_proba(row)[0][0], 2)
    prob1 = round(log_multi.predict_proba(row)[0][1], 2)
    prob2 = round(log_multi.predict_proba(row)[0][2], 2)
    print("Predicted Probabilities: low 0 - {}, medium 1 - {}, high 2 - {}".format(prob0, prob1, prob2))
    # probs = log_multi.predict_proba(row)[0]
    out = log_multi.predict(row)[0]
    print("Prediction:", out)
    
In [676]:
predictsupplychain()
Supply_Chain_Input_Data :
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 8 839 481 0 0 98 40 387.89
1 8 351 332 0 1 52 35 891.03
2 5 398 156 0 0 34 43 933.21
3 5 972 418 1 0 32 45 628.57
4 3 529 428 0 0 44 41 70.33
Logistic Regression built on the below X columns:

Index(['Lead_Time', 'Demand_Forecast', 'Inventory_Level', 'Stockout_Flag',
       'Backorder_Flag', 'Order_Quantity', 'Shipment_Quantity',
       'Product_Price'],
      dtype='object')
======================= Enter Input Data ====================








Given User Input Data:
Lead_Time Demand_Forecast Inventory_Level Stockout_Flag Backorder_Flag Order_Quantity Shipment_Quantity Product_Price
0 4 555 455 2 5 41 55 554.35
********** Prediction ***********

Number of classes: 3
Predicted Probabilities: low 0 - 0.0, medium 1 - 0.0, high 2 - 1.0
Prediction: medium
In [ ]: